import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
data = pd.read_excel("C:/Users/Topsheed/Desktop/HotelDataset.xlsx")
data
| Booking ID | Date of Booking | Time | Customer ID | Gender | Age | Origin Country | State | Location | Destination Country | ... | Check-Out Date | Rooms | Hotel Name | Hotel Rating | Payment Mode | Bank Name | Booking Price[SGD] | Discount | GST | Profit Margin | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DDMY00001 | 2010-01-01 | 10:49:40 | MY00001 | Male | 42 | Malaysia | Johor | Iskandar Puteri | Denmark | ... | 2010-01-20 | 1 | Hotel Triton | 4.3 | Wallet | United Overseas Bank (UOB) | 243 | 0.01 | 0.07 | 0.25 |
| 1 | DDID00002 | 2010-01-01 | 09:19:47 | ID00001 | Female | 44 | Indonesia | Ciawi | West Java | Colombia | ... | 2010-01-22 | 2 | Enchanted Isle | 4.2 | Wallet | EZ-Link | 312 | 0.00 | 0.07 | 0.24 |
| 2 | DDSG00003 | 2010-01-01 | 11:52:56 | SG00001 | Female | 31 | Singapore | Central | Rochor | Germany | ... | 2010-01-09 | 2 | Seacoast Hotel | 4.5 | Credit Card | Grab | 338 | 0.19 | 0.07 | 0.20 |
| 3 | DDSG00004 | 2010-01-01 | 13:44:40 | SG00002 | Male | 28 | Singapore | North | Yishun | Canada | ... | 2010-02-19 | 2 | Night In Paradise | 4.2 | Debit Card | DBS Paylah | 254 | 0.19 | 0.07 | 0.13 |
| 4 | DDKH00005 | 2010-01-01 | 05:38:26 | KH00001 | Male | 44 | Cambodia | Phnom Trop | Pursat | Kenya | ... | 2010-01-06 | 3 | Tiny Digs Hotel | 4.6 | Wallet | Bank of Singapore (BOS) | 313 | 0.15 | 0.07 | 0.17 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 66536 | DDSG66531 | 2019-12-31 | 23:36:16 | SG12034 | Female | 42 | Singapore | Central | Orchard | Germany | ... | 2020-01-10 | 2 | Silver Cloud Inn | 4.3 | Wallet | Grab | 182 | 0.23 | 0.07 | 0.24 |
| 66537 | DDSG66532 | 2019-12-31 | 14:41:01 | SG12035 | Female | 54 | Singapore | Central | Geylang | Israel | ... | 2020-04-13 | 2 | The Elet | 4.2 | COD | DBS Paylah | 125 | 0.06 | 0.07 | 0.19 |
| 66538 | DDSG66533 | 2019-12-31 | 19:11:16 | SG12036 | Female | 57 | Singapore | Central | Downtown Core | Canada | ... | 2020-01-10 | 4 | The Elet | 4.4 | Debit Card | EZ-Link | 318 | 0.02 | 0.07 | 0.22 |
| 66539 | DDTH66534 | 2019-12-31 | 05:12:29 | TH12170 | Female | 44 | Thailand | Surat Thani | Ko Samui | Maldives | ... | 2020-01-02 | 2 | Sunset Lodge | 4.2 | Debit Card | HSBC | 173 | 0.14 | 0.07 | 0.25 |
| 66540 | DDVN66535 | 2019-12-31 | 00:51:52 | VN05959 | Female | 52 | Vietnam | Pleiku | Gia Lai | Egypt | ... | 2020-01-28 | 3 | Coastal bay hotel | 4.3 | Internet Banking | Grab | 182 | 0.17 | 0.07 | 0.24 |
66541 rows × 24 columns
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 66541 entries, 0 to 66540 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Booking ID 66541 non-null object 1 Date of Booking 66541 non-null datetime64[ns] 2 Time 66541 non-null object 3 Customer ID 66541 non-null object 4 Gender 66541 non-null object 5 Age 66541 non-null int64 6 Origin Country 66541 non-null object 7 State 66541 non-null object 8 Location 66541 non-null object 9 Destination Country 66541 non-null object 10 Destination City 66541 non-null object 11 No. Of People 66541 non-null int64 12 Check-in date 66541 non-null datetime64[ns] 13 No of Days 0 non-null float64 14 Check-Out Date 66541 non-null datetime64[ns] 15 Rooms 66541 non-null int64 16 Hotel Name 66541 non-null object 17 Hotel Rating 66541 non-null float64 18 Payment Mode 66541 non-null object 19 Bank Name 66541 non-null object 20 Booking Price[SGD] 66541 non-null int64 21 Discount 66541 non-null float64 22 GST 66541 non-null float64 23 Profit Margin 66541 non-null float64 dtypes: datetime64[ns](3), float64(5), int64(4), object(12) memory usage: 12.2+ MB
data.isnull().sum()
Booking ID 0 Date of Booking 0 Time 0 Customer ID 0 Gender 0 Age 0 Origin Country 0 State 0 Location 0 Destination Country 0 Destination City 0 No. Of People 0 Check-in date 0 No of Days 66541 Check-Out Date 0 Rooms 0 Hotel Name 0 Hotel Rating 0 Payment Mode 0 Bank Name 0 Booking Price[SGD] 0 Discount 0 GST 0 Profit Margin 0 dtype: int64
data.columns
data.nunique()
Booking ID 66536 Date of Booking 3652 Time 39308 Customer ID 66536 Gender 2 Age 41 Origin Country 7 State 219 Location 228 Destination Country 20 Destination City 120 No. Of People 7 Check-in date 3747 No of Days 0 Check-Out Date 3748 Rooms 4 Hotel Name 614 Hotel Rating 11 Payment Mode 5 Bank Name 9 Booking Price[SGD] 491 Discount 26 GST 1 Profit Margin 17 dtype: int64
data.columns
Index(['Booking ID', 'Date of Booking', 'Time', 'Customer ID', 'Gender', 'Age',
'Origin Country', 'State', 'Location', 'Destination Country',
'Destination City', 'No. Of People', 'Check-in date', 'No of Days',
'Check-Out Date', 'Rooms', 'Hotel Name', 'Hotel Rating', 'Payment Mode',
'Bank Name', 'Booking Price[SGD]', 'Discount', 'GST', 'Profit Margin'],
dtype='object')
data['Profit Margin'].sum()
12564.040000000003
data1 = data.drop(['Booking ID','Customer ID','GST','State','Location','Bank Name','No of Days',], axis = 1)
data1 is the new data after droping of some of the unneeded data variables
data1
| Date of Booking | Time | Gender | Age | Origin Country | Destination Country | Destination City | No. Of People | Check-in date | Check-Out Date | Rooms | Hotel Name | Hotel Rating | Payment Mode | Booking Price[SGD] | Discount | Profit Margin | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2010-01-01 | 10:49:40 | Male | 42 | Malaysia | Denmark | Horsens | 1 | 2010-01-12 | 2010-01-20 | 1 | Hotel Triton | 4.3 | Wallet | 243 | 0.01 | 0.25 |
| 1 | 2010-01-01 | 09:19:47 | Female | 44 | Indonesia | Colombia | Medellin | 3 | 2010-01-21 | 2010-01-22 | 2 | Enchanted Isle | 4.2 | Wallet | 312 | 0.00 | 0.24 |
| 2 | 2010-01-01 | 11:52:56 | Female | 31 | Singapore | Germany | Munich | 3 | 2010-01-02 | 2010-01-09 | 2 | Seacoast Hotel | 4.5 | Credit Card | 338 | 0.19 | 0.20 |
| 3 | 2010-01-01 | 13:44:40 | Male | 28 | Singapore | Canada | Montreal | 3 | 2010-02-15 | 2010-02-19 | 2 | Night In Paradise | 4.2 | Debit Card | 254 | 0.19 | 0.13 |
| 4 | 2010-01-01 | 05:38:26 | Male | 44 | Cambodia | Kenya | Nairobi | 5 | 2010-01-03 | 2010-01-06 | 3 | Tiny Digs Hotel | 4.6 | Wallet | 313 | 0.15 | 0.17 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 66536 | 2019-12-31 | 23:36:16 | Female | 42 | Singapore | Germany | Berlin | 4 | 2020-01-06 | 2020-01-10 | 2 | Silver Cloud Inn | 4.3 | Wallet | 182 | 0.23 | 0.24 |
| 66537 | 2019-12-31 | 14:41:01 | Female | 54 | Singapore | Israel | Holon | 4 | 2020-04-09 | 2020-04-13 | 2 | The Elet | 4.2 | COD | 125 | 0.06 | 0.19 |
| 66538 | 2019-12-31 | 19:11:16 | Female | 57 | Singapore | Canada | Ottawa | 7 | 2020-01-09 | 2020-01-10 | 4 | The Elet | 4.4 | Debit Card | 318 | 0.02 | 0.22 |
| 66539 | 2019-12-31 | 05:12:29 | Female | 44 | Thailand | Maldives | Viligili | 3 | 2020-01-01 | 2020-01-02 | 2 | Sunset Lodge | 4.2 | Debit Card | 173 | 0.14 | 0.25 |
| 66540 | 2019-12-31 | 00:51:52 | Female | 52 | Vietnam | Egypt | Luxor | 5 | 2020-01-24 | 2020-01-28 | 3 | Coastal bay hotel | 4.3 | Internet Banking | 182 | 0.17 | 0.24 |
66541 rows × 17 columns
data1.columns = data1.columns.str.replace (' ','_') # Replacing the space (s) in the column names with underscore
data
| Booking ID | Date of Booking | Time | Customer ID | Gender | Age | Origin Country | State | Location | Destination Country | ... | Check-Out Date | Rooms | Hotel Name | Hotel Rating | Payment Mode | Bank Name | Booking Price[SGD] | Discount | GST | Profit Margin | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DDMY00001 | 2010-01-01 | 10:49:40 | MY00001 | Male | 42 | Malaysia | Johor | Iskandar Puteri | Denmark | ... | 2010-01-20 | 1 | Hotel Triton | 4.3 | Wallet | United Overseas Bank (UOB) | 243 | 0.01 | 0.07 | 0.25 |
| 1 | DDID00002 | 2010-01-01 | 09:19:47 | ID00001 | Female | 44 | Indonesia | Ciawi | West Java | Colombia | ... | 2010-01-22 | 2 | Enchanted Isle | 4.2 | Wallet | EZ-Link | 312 | 0.00 | 0.07 | 0.24 |
| 2 | DDSG00003 | 2010-01-01 | 11:52:56 | SG00001 | Female | 31 | Singapore | Central | Rochor | Germany | ... | 2010-01-09 | 2 | Seacoast Hotel | 4.5 | Credit Card | Grab | 338 | 0.19 | 0.07 | 0.20 |
| 3 | DDSG00004 | 2010-01-01 | 13:44:40 | SG00002 | Male | 28 | Singapore | North | Yishun | Canada | ... | 2010-02-19 | 2 | Night In Paradise | 4.2 | Debit Card | DBS Paylah | 254 | 0.19 | 0.07 | 0.13 |
| 4 | DDKH00005 | 2010-01-01 | 05:38:26 | KH00001 | Male | 44 | Cambodia | Phnom Trop | Pursat | Kenya | ... | 2010-01-06 | 3 | Tiny Digs Hotel | 4.6 | Wallet | Bank of Singapore (BOS) | 313 | 0.15 | 0.07 | 0.17 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 66536 | DDSG66531 | 2019-12-31 | 23:36:16 | SG12034 | Female | 42 | Singapore | Central | Orchard | Germany | ... | 2020-01-10 | 2 | Silver Cloud Inn | 4.3 | Wallet | Grab | 182 | 0.23 | 0.07 | 0.24 |
| 66537 | DDSG66532 | 2019-12-31 | 14:41:01 | SG12035 | Female | 54 | Singapore | Central | Geylang | Israel | ... | 2020-04-13 | 2 | The Elet | 4.2 | COD | DBS Paylah | 125 | 0.06 | 0.07 | 0.19 |
| 66538 | DDSG66533 | 2019-12-31 | 19:11:16 | SG12036 | Female | 57 | Singapore | Central | Downtown Core | Canada | ... | 2020-01-10 | 4 | The Elet | 4.4 | Debit Card | EZ-Link | 318 | 0.02 | 0.07 | 0.22 |
| 66539 | DDTH66534 | 2019-12-31 | 05:12:29 | TH12170 | Female | 44 | Thailand | Surat Thani | Ko Samui | Maldives | ... | 2020-01-02 | 2 | Sunset Lodge | 4.2 | Debit Card | HSBC | 173 | 0.14 | 0.07 | 0.25 |
| 66540 | DDVN66535 | 2019-12-31 | 00:51:52 | VN05959 | Female | 52 | Vietnam | Pleiku | Gia Lai | Egypt | ... | 2020-01-28 | 3 | Coastal bay hotel | 4.3 | Internet Banking | Grab | 182 | 0.17 | 0.07 | 0.24 |
66541 rows × 24 columns
# Convert date columns to datetime objects
data1['Check-Out_Date'] = pd.to_datetime(data1['Check-Out_Date'])
data1['Check-in_date'] = pd.to_datetime(data1['Check-in_date'])
# Create number of days spent column by subtracting booking date from checkin date
data1['No_of_Days'] = data1['Check-Out_Date'] - data1['Check-in_date']
#print new data1
data1.head()
| Date_of_Booking | Time | Gender | Age | Origin_Country | Destination_Country | Destination_City | No._Of_People | Check-in_date | Check-Out_Date | Rooms | Hotel_Name | Hotel_Rating | Payment_Mode | Booking_Price[SGD] | Discount | Profit_Margin | No_of_Days | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2010-01-01 | 10:49:40 | Male | 42 | Malaysia | Denmark | Horsens | 1 | 2010-01-12 | 2010-01-20 | 1 | Hotel Triton | 4.3 | Wallet | 243 | 0.01 | 0.25 | 8 days |
| 1 | 2010-01-01 | 09:19:47 | Female | 44 | Indonesia | Colombia | Medellin | 3 | 2010-01-21 | 2010-01-22 | 2 | Enchanted Isle | 4.2 | Wallet | 312 | 0.00 | 0.24 | 1 days |
| 2 | 2010-01-01 | 11:52:56 | Female | 31 | Singapore | Germany | Munich | 3 | 2010-01-02 | 2010-01-09 | 2 | Seacoast Hotel | 4.5 | Credit Card | 338 | 0.19 | 0.20 | 7 days |
| 3 | 2010-01-01 | 13:44:40 | Male | 28 | Singapore | Canada | Montreal | 3 | 2010-02-15 | 2010-02-19 | 2 | Night In Paradise | 4.2 | Debit Card | 254 | 0.19 | 0.13 | 4 days |
| 4 | 2010-01-01 | 05:38:26 | Male | 44 | Cambodia | Kenya | Nairobi | 5 | 2010-01-03 | 2010-01-06 | 3 | Tiny Digs Hotel | 4.6 | Wallet | 313 | 0.15 | 0.17 | 3 days |
#Converting a timestamp column into string so we can extract 'Year as BookingYear', BookingMonth as Month
df=data1['Date_of_Booking'].astype(str)
#extract Year, Month, Date
data1['BookingYear'] =df.apply(lambda x: x.split("-")[0])
data1['BookingMonth'] =df.apply(lambda x: x.split("-")[1])
data1['BookingDay'] =df.apply(lambda x: x.split("-")[2])
#print our new data1
data1
| Date_of_Booking | Time | Gender | Age | Origin_Country | Destination_Country | Destination_City | No._Of_People | Check-in_date | Check-Out_Date | ... | Hotel_Name | Hotel_Rating | Payment_Mode | Booking_Price[SGD] | Discount | Profit_Margin | No_of_Days | BookingYear | BookingMonth | BookingDay | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2010-01-01 | 10:49:40 | Male | 42 | Malaysia | Denmark | Horsens | 1 | 2010-01-12 | 2010-01-20 | ... | Hotel Triton | 4.3 | Wallet | 243 | 0.01 | 0.25 | 8 days | 2010 | 01 | 01 |
| 1 | 2010-01-01 | 09:19:47 | Female | 44 | Indonesia | Colombia | Medellin | 3 | 2010-01-21 | 2010-01-22 | ... | Enchanted Isle | 4.2 | Wallet | 312 | 0.00 | 0.24 | 1 days | 2010 | 01 | 01 |
| 2 | 2010-01-01 | 11:52:56 | Female | 31 | Singapore | Germany | Munich | 3 | 2010-01-02 | 2010-01-09 | ... | Seacoast Hotel | 4.5 | Credit Card | 338 | 0.19 | 0.20 | 7 days | 2010 | 01 | 01 |
| 3 | 2010-01-01 | 13:44:40 | Male | 28 | Singapore | Canada | Montreal | 3 | 2010-02-15 | 2010-02-19 | ... | Night In Paradise | 4.2 | Debit Card | 254 | 0.19 | 0.13 | 4 days | 2010 | 01 | 01 |
| 4 | 2010-01-01 | 05:38:26 | Male | 44 | Cambodia | Kenya | Nairobi | 5 | 2010-01-03 | 2010-01-06 | ... | Tiny Digs Hotel | 4.6 | Wallet | 313 | 0.15 | 0.17 | 3 days | 2010 | 01 | 01 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 66536 | 2019-12-31 | 23:36:16 | Female | 42 | Singapore | Germany | Berlin | 4 | 2020-01-06 | 2020-01-10 | ... | Silver Cloud Inn | 4.3 | Wallet | 182 | 0.23 | 0.24 | 4 days | 2019 | 12 | 31 |
| 66537 | 2019-12-31 | 14:41:01 | Female | 54 | Singapore | Israel | Holon | 4 | 2020-04-09 | 2020-04-13 | ... | The Elet | 4.2 | COD | 125 | 0.06 | 0.19 | 4 days | 2019 | 12 | 31 |
| 66538 | 2019-12-31 | 19:11:16 | Female | 57 | Singapore | Canada | Ottawa | 7 | 2020-01-09 | 2020-01-10 | ... | The Elet | 4.4 | Debit Card | 318 | 0.02 | 0.22 | 1 days | 2019 | 12 | 31 |
| 66539 | 2019-12-31 | 05:12:29 | Female | 44 | Thailand | Maldives | Viligili | 3 | 2020-01-01 | 2020-01-02 | ... | Sunset Lodge | 4.2 | Debit Card | 173 | 0.14 | 0.25 | 1 days | 2019 | 12 | 31 |
| 66540 | 2019-12-31 | 00:51:52 | Female | 52 | Vietnam | Egypt | Luxor | 5 | 2020-01-24 | 2020-01-28 | ... | Coastal bay hotel | 4.3 | Internet Banking | 182 | 0.17 | 0.24 | 4 days | 2019 | 12 | 31 |
66541 rows × 21 columns
# Convert Lead time columns by substracting Date_of_Booking from Check_in_Date
data1['Lead_time'] = data1['Check-in_date'] - data1['Date_of_Booking']
#print new data1
data1.head()
| Date_of_Booking | Time | Gender | Age | Origin_Country | Destination_Country | Destination_City | No._Of_People | Check-in_date | Check-Out_Date | ... | Hotel_Rating | Payment_Mode | Booking_Price[SGD] | Discount | Profit_Margin | No_of_Days | BookingYear | BookingMonth | BookingDay | Lead_time | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2010-01-01 | 10:49:40 | Male | 42 | Malaysia | Denmark | Horsens | 1 | 2010-01-12 | 2010-01-20 | ... | 4.3 | Wallet | 243 | 0.01 | 0.25 | 8 days | 2010 | 01 | 01 | 11 days |
| 1 | 2010-01-01 | 09:19:47 | Female | 44 | Indonesia | Colombia | Medellin | 3 | 2010-01-21 | 2010-01-22 | ... | 4.2 | Wallet | 312 | 0.00 | 0.24 | 1 days | 2010 | 01 | 01 | 20 days |
| 2 | 2010-01-01 | 11:52:56 | Female | 31 | Singapore | Germany | Munich | 3 | 2010-01-02 | 2010-01-09 | ... | 4.5 | Credit Card | 338 | 0.19 | 0.20 | 7 days | 2010 | 01 | 01 | 1 days |
| 3 | 2010-01-01 | 13:44:40 | Male | 28 | Singapore | Canada | Montreal | 3 | 2010-02-15 | 2010-02-19 | ... | 4.2 | Debit Card | 254 | 0.19 | 0.13 | 4 days | 2010 | 01 | 01 | 45 days |
| 4 | 2010-01-01 | 05:38:26 | Male | 44 | Cambodia | Kenya | Nairobi | 5 | 2010-01-03 | 2010-01-06 | ... | 4.6 | Wallet | 313 | 0.15 | 0.17 | 3 days | 2010 | 01 | 01 | 2 days |
5 rows × 22 columns
data1 = data1.drop(['Date_of_Booking','Check-in_date','Check-Out_Date'], axis = 1)
data1
| Time | Gender | Age | Origin_Country | Destination_Country | Destination_City | No._Of_People | Rooms | Hotel_Name | Hotel_Rating | Payment_Mode | Booking_Price[SGD] | Discount | Profit_Margin | No_of_Days | BookingYear | BookingMonth | BookingDay | Lead_time | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10:49:40 | Male | 42 | Malaysia | Denmark | Horsens | 1 | 1 | Hotel Triton | 4.3 | Wallet | 243 | 0.01 | 0.25 | 8 days | 2010 | 01 | 01 | 11 days |
| 1 | 09:19:47 | Female | 44 | Indonesia | Colombia | Medellin | 3 | 2 | Enchanted Isle | 4.2 | Wallet | 312 | 0.00 | 0.24 | 1 days | 2010 | 01 | 01 | 20 days |
| 2 | 11:52:56 | Female | 31 | Singapore | Germany | Munich | 3 | 2 | Seacoast Hotel | 4.5 | Credit Card | 338 | 0.19 | 0.20 | 7 days | 2010 | 01 | 01 | 1 days |
| 3 | 13:44:40 | Male | 28 | Singapore | Canada | Montreal | 3 | 2 | Night In Paradise | 4.2 | Debit Card | 254 | 0.19 | 0.13 | 4 days | 2010 | 01 | 01 | 45 days |
| 4 | 05:38:26 | Male | 44 | Cambodia | Kenya | Nairobi | 5 | 3 | Tiny Digs Hotel | 4.6 | Wallet | 313 | 0.15 | 0.17 | 3 days | 2010 | 01 | 01 | 2 days |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 66536 | 23:36:16 | Female | 42 | Singapore | Germany | Berlin | 4 | 2 | Silver Cloud Inn | 4.3 | Wallet | 182 | 0.23 | 0.24 | 4 days | 2019 | 12 | 31 | 6 days |
| 66537 | 14:41:01 | Female | 54 | Singapore | Israel | Holon | 4 | 2 | The Elet | 4.2 | COD | 125 | 0.06 | 0.19 | 4 days | 2019 | 12 | 31 | 100 days |
| 66538 | 19:11:16 | Female | 57 | Singapore | Canada | Ottawa | 7 | 4 | The Elet | 4.4 | Debit Card | 318 | 0.02 | 0.22 | 1 days | 2019 | 12 | 31 | 9 days |
| 66539 | 05:12:29 | Female | 44 | Thailand | Maldives | Viligili | 3 | 2 | Sunset Lodge | 4.2 | Debit Card | 173 | 0.14 | 0.25 | 1 days | 2019 | 12 | 31 | 1 days |
| 66540 | 00:51:52 | Female | 52 | Vietnam | Egypt | Luxor | 5 | 3 | Coastal bay hotel | 4.3 | Internet Banking | 182 | 0.17 | 0.24 | 4 days | 2019 | 12 | 31 | 24 days |
66541 rows × 19 columns
data1.describe(include='all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Time | 66541 | 39308 | 09:48:22 | 14 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Gender | 66541 | 2 | Female | 33388 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Age | 66541.0 | NaN | NaN | NaN | 38.436182 | 11.557562 | -5.0 | 28.0 | 38.0 | 48.0 | 58.0 |
| Origin_Country | 66541 | 7 | Thailand | 12170 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Destination_Country | 66541 | 20 | New Zealand | 3448 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Destination_City | 66541 | 120 | Hamilton | 1084 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| No._Of_People | 66541.0 | NaN | NaN | NaN | 3.995777 | 2.000074 | 1.0 | 2.0 | 4.0 | 6.0 | 7.0 |
| Rooms | 66541.0 | NaN | NaN | NaN | 2.283344 | 1.031134 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 |
| Hotel_Name | 66541 | 614 | Grand Hyatt | 221 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Hotel_Rating | 66541.0 | NaN | NaN | NaN | 4.2885 | 0.275516 | 3.3 | 4.2 | 4.3 | 4.5 | 4.7 |
| Payment_Mode | 66541 | 5 | Internet Banking | 13446 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Booking_Price[SGD] | 66541.0 | NaN | NaN | NaN | 214.046783 | 108.126184 | 35.0 | 125.0 | 199.0 | 293.0 | 578.0 |
| Discount | 66541.0 | NaN | NaN | NaN | 0.12547 | 0.075113 | 0.0 | 0.06 | 0.13 | 0.19 | 0.25 |
| Profit_Margin | 66541.0 | NaN | NaN | NaN | 0.188817 | 0.052765 | 0.1 | 0.14 | 0.2 | 0.23 | 0.3 |
| No_of_Days | 66541 | NaN | NaN | NaN | 3 days 05:36:37.352008536 | 3 days 09:04:06.621354014 | 1 days 00:00:00 | 1 days 00:00:00 | 2 days 00:00:00 | 4 days 00:00:00 | 97 days 00:00:00 |
| BookingYear | 66541 | 10 | 2019 | 9502 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| BookingMonth | 66541 | 12 | 12 | 5747 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| BookingDay | 66541 | 31 | 25 | 2255 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Lead_time | 66541 | NaN | NaN | NaN | 28 days 12:29:07.061210381 | 30 days 05:02:51.843807522 | 1 days 00:00:00 | 4 days 00:00:00 | 15 days 00:00:00 | 51 days 00:00:00 | 100 days 00:00:00 |
# seprating numerical and catigorical variables for easy analysis
cat_cols = data1.select_dtypes(include=['object']).columns
num_cols = data1.select_dtypes(include=np.number).columns.tolist()
print(cat_cols)
print("Numerical Variables: ")
print(num_cols)
Index(['Time', 'Gender', 'Origin_Country', 'Destination_Country',
'Destination_City', 'Hotel_Name', 'Payment_Mode', 'BookingYear',
'BookingMonth', 'BookingDay'],
dtype='object')
Numerical Variables:
['Age', 'No._Of_People', 'Rooms', 'Hotel_Rating', 'Booking_Price[SGD]', 'Discount', 'Profit_Margin', 'No_of_Days', 'Lead_time']
for col in num_cols:
print(col)
print('Skew :', round(data1[col].skew(), 2))
plt.figure(figsize = (15, 4))
plt.subplot(1, 2, 1)
data1[col].hist(grid=False)
plt.ylabel('count')
plt.subplot(1, 2, 2)
sns.boxplot(x=data1[col])
plt.show()
Age Skew : 0.01
No._Of_People Skew : 0.01
Rooms Skew : 0.2
Hotel_Rating Skew : -1.17
Booking_Price[SGD] Skew : 0.39
Discount Skew : -0.01
Profit_Margin Skew : 0.08
No_of_Days
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[19], line 5 1 for col in num_cols: 3 print(col) ----> 5 print('Skew :', round(data1[col].skew(), 2)) 7 plt.figure(figsize = (15, 4)) 9 plt.subplot(1, 2, 1) File ~\anaconda3\Lib\site-packages\pandas\core\generic.py:11577, in NDFrame._add_numeric_operations.<locals>.skew(self, axis, skipna, numeric_only, **kwargs) 11560 @doc( 11561 _num_doc, 11562 desc="Return unbiased skew over requested axis.\n\nNormalized by N-1.", (...) 11575 **kwargs, 11576 ): > 11577 return NDFrame.skew(self, axis, skipna, numeric_only, **kwargs) File ~\anaconda3\Lib\site-packages\pandas\core\generic.py:11223, in NDFrame.skew(self, axis, skipna, numeric_only, **kwargs) 11216 def skew( 11217 self, 11218 axis: Axis | None = 0, (...) 11221 **kwargs, 11222 ) -> Series | float: > 11223 return self._stat_function( 11224 "skew", nanops.nanskew, axis, skipna, numeric_only, **kwargs 11225 ) File ~\anaconda3\Lib\site-packages\pandas\core\generic.py:11158, in NDFrame._stat_function(self, name, func, axis, skipna, numeric_only, **kwargs) 11154 nv.validate_stat_func((), kwargs, fname=name) 11156 validate_bool_kwarg(skipna, "skipna", none_allowed=False) > 11158 return self._reduce( 11159 func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only 11160 ) File ~\anaconda3\Lib\site-packages\pandas\core\series.py:4656, in Series._reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds) 4652 self._get_axis_number(axis) 4654 if isinstance(delegate, ExtensionArray): 4655 # dispatch to ExtensionArray interface -> 4656 return delegate._reduce(name, skipna=skipna, **kwds) 4658 else: 4659 # dispatch to numpy arrays 4660 if numeric_only and not is_numeric_dtype(self.dtype): File ~\anaconda3\Lib\site-packages\pandas\core\arrays\base.py:1440, in ExtensionArray._reduce(self, name, skipna, **kwargs) 1438 meth = getattr(self, name, None) 1439 if meth is None: -> 1440 raise TypeError( 1441 f"'{type(self).__name__}' with dtype {self.dtype} " 1442 f"does not support reduction '{name}'" 1443 ) 1444 return meth(skipna=skipna, **kwargs) TypeError: 'TimedeltaArray' with dtype timedelta64[ns] does not support reduction 'skew'
# Function for log transformation of the column
def log_transform(data,col):
for colname in col:
if (data[colname] == 1.0).all():
data[colname + '_log'] = np.log(data[colname]+1)
else:
data1[colname + '_log'] = np.log(data1[colname])
log_transform(data1,['Hotel_Rating'])
#Log transformation of the feature 'Hotel_Rating'
sns.distplot(data1["Hotel_Rating_log"], axlabel="Hotel_Rating_log");
plt.show()
data1
fig, axes = plt.subplots(5, 2, figsize = (100, 150))
sns.set_style('whitegrid')
fig.suptitle('Bar plot for all categorical variables in the dataset')
sns.countplot(ax = axes[0, 0], x = 'Payment_Mode', data = data1, palette = 'RdBu_r',
order = data1['Payment_Mode'].value_counts().head(20).index);
sns.countplot(ax = axes[0, 1], x = 'BookingDay', data = data1, palette = 'rainbow',
order = data1['BookingDay'].value_counts().head(20).index);
sns.countplot(ax = axes[1, 0], x = 'Gender', data = data1, palette="Set1",
order = data1['Gender'].value_counts().head(20).index);
sns.countplot(ax = axes[1, 1], x = 'Hotel_Name', data = data1, palette="PuOr",
order = data1['Hotel_Name'].value_counts().head(20).index);
sns.countplot(ax = axes[2, 0], x = 'Origin_Country', data = data1, color = 'blue',
order = data1['Origin_Country'].value_counts().head(20).index);
sns.countplot(ax = axes[2, 1], x = 'Destination_Country', data = data1, palette="RdBu",
order = data1['Destination_Country'].value_counts().head(20).index);
sns.countplot(ax = axes[3, 0], x = 'BookingYear', data = data1, color = 'red',
order = data1['BookingYear'].value_counts().head(20).index);
sns.countplot(ax = axes[3, 1], x = 'BookingMonth', data = data1, palette="Set2",
order = data1['BookingMonth'].value_counts().head(20).index);
sns.countplot(ax = axes[4, 0], x = 'Destination_City', data = data1, color = 'red',
order = data1['Destination_City'].value_counts().head(20).index);
sns.countplot(ax = axes[4, 1], x = 'Time', data = data1, palette="Set2",
order = data1['Time'].value_counts().head(20).index);
axes[1][1].tick_params(labelrotation=90);
axes[2][0].tick_params(labelrotation=90);
axes[2][1].tick_params(labelrotation=90);
axes[4][0].tick_params(labelrotation=90);
axes[4][1].tick_params(labelrotation=90);
plt.rc('font', size=8)
plt.rc('axes', titlesize=100)
plt.rc('axes', labelsize=20)
plt.rc('xtick', labelsize=80)
plt.rc('ytick', labelsize=80)
plt.rc('legend', fontsize=80)
plt.rc('figure', titlesize=100)
plt.subplots_adjust(hspace=1)
plt.subplots_adjust(wspace=.5)
sns.despine()
num_cols
['Age', 'No._Of_People', 'Rooms', 'Hotel_Rating', 'Booking_Price[SGD]', 'Discount', 'Profit_Margin', 'No_of_Days', 'Lead_time']
figsize = (100, 150)
sns.set_style('whitegrid')
ax = sns.countplot(y='BookingDay', hue = 'Gender', data=data1, palette='colorblind', order = data1['BookingDay'].value_counts().head().index)
total = data1['BookingDay'].count()
ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')
ax.margins(x=0.1)
ax.set(xlabel='Count', ylabel='BookingDay', title='BookingDay by Gender')
ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
plt.tight_layout()
plt.show()
figsize = (18, 20)
sns.set_style('whitegrid')
ax = sns.countplot(y='BookingYear', hue = 'Gender', data=data1, palette='Set1', order = data1['BookingYear'].value_counts().head().index)
total = data1['BookingYear'].count()
ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')
ax.margins(x=0.1)
ax.set(xlabel='Count', ylabel='BookingYear', title='BookingYear by Gender')
ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
plt.tight_layout()
plt.show()
figsize = (18, 20)
sns.set_style('whitegrid')
ax = sns.countplot(y='Booking_Price[SGD]', hue = 'Gender', data=data1, palette='Set2', order = data1['Booking_Price[SGD]'].value_counts().head().index)
total = data1['Booking_Price[SGD]'].count()
ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')
ax.margins(x=0.1)
ax.set(xlabel='Count', ylabel='Booking_Price[SGD]', title='Booking_Price by Gender')
ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
plt.tight_layout()
plt.show
figsize = (18, 20)
sns.set_style('whitegrid')
ax = sns.countplot(y='Payment_Mode', hue = 'Gender', data=data1, palette='rainbow', order = data1['Payment_Mode'].value_counts().head().index)
total = data1['Payment_Mode'].count()
ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')
ax.margins(x=0.1)
ax.set(xlabel='Count', ylabel='Payment_Mode', title='Payment_Mode by Gender')
ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
plt.tight_layout()
plt.show()
figsize = (18, 20)
sns.set_style('whitegrid')
ax = sns.countplot(y='Hotel_Rating', hue = 'Gender', data=data1, palette='Set3', order = data1['Hotel_Rating'].value_counts().head().index)
total = data1['Hotel_Rating'].count()
ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')
ax.margins(x=0.1)
ax.set(xlabel='Count', ylabel='Hotel_Rating', title='Hotel_Rating by Gender')
ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
plt.tight_layout()
plt.show()
figsize = (18, 20)
sns.set_style('whitegrid')
ax = sns.countplot(y='Hotel_Name', hue = 'Gender', data=data1, palette='PuOr', order = data1['Hotel_Name'].value_counts().head().index)
total = data1['Hotel_Name'].count()
ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')
ax.margins(x=0.1)
ax.set(xlabel='Count', ylabel='Hotel_Name', title='Hotel_Name by Gender')
ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
plt.tight_layout()
plt.show()
figsize = (18, 20)
sns.set_style('whitegrid')
ax = sns.countplot(y='Destination_Country', hue = 'Gender', data=data1, palette="Paired", order = data1['Destination_Country'].value_counts().head().index)
total = data1['Destination_Country'].count()
ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')
ax.margins(x=0.1)
ax.set(xlabel='Count', ylabel='Destination_Country', title='Destination_Country by Gender')
ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
plt.tight_layout()
plt.show()
figsize = (18, 20)
sns.set_style('whitegrid')
ax = sns.countplot(y='Age', hue = 'Gender', data=data1, palette='PuOr', order = data1['Age'].value_counts().head().index)
total = data1['Age'].count()
ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')
ax.margins(x=0.1)
ax.set(xlabel='Count', ylabel='Age', title='Age by Gender')
ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
plt.tight_layout()
plt.show()
figsize = (18, 20)
sns.set_style('whitegrid')
ax = sns.countplot(y='Origin_Country', hue = 'Gender', data=data1, palette='colorblind', order = data1['Origin_Country'].value_counts().head().index)
total = data1['Origin_Country'].count()
ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')
ax.margins(x=0.1)
ax.set(xlabel='Count', ylabel='Origin_Country', title='Origin_Country by Gender')
ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
plt.tight_layout()
plt.show()
figsize = (18, 20)
sns.set_style('whitegrid')
ax = sns.countplot(y='Lead_time', hue = 'Gender', data=data1, palette='colorblind', order = data1['Lead_time'].value_counts().head().index)
total = data1['Lead_time'].count()
ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')
ax.margins(x=0.1)
ax.set(xlabel='Count', ylabel='Lead_time', title='Lead_time by Gender')
ax.set_xticklabels(ax.get_xticklabels(), fontsize=8)
plt.tight_layout()
plt.show()
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:15: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:38: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:59: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_xticklabels(ax.get_xticklabels(), fontsize=7) C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:80: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_xticklabels(ax.get_xticklabels(), fontsize=7) C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:82: UserWarning: The figure layout has changed to tight plt.tight_layout()
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:102: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:124: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:145: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:166: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:187: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:208: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_xticklabels(ax.get_xticklabels(), fontsize=8)
figsize = (100, 150)
sns.set_style('whitegrid')
ax = sns.countplot(y='BookingDay', hue = 'Gender', data=data1, palette='colorblind', order = data1['BookingDay'].value_counts().head().index)
total = data1['BookingDay'].count()
ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')
ax.margins(x=0.1)
ax.set(xlabel='Count', ylabel='BookingDay', title='BookingDay by Gender')
ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
plt.rc('font', size=15)
plt.rc('axes', titlesize=20)
plt.rc('axes', labelsize=35)
plt.rc('xtick', labelsize=20)
plt.rc('ytick', labelsize=20)
plt.rc('legend', fontsize=15)
plt.rc('figure', titlesize=20)
plt.subplots_adjust(hspace=1)
plt.subplots_adjust(wspace=.5)
sns.despine()
plt.tight_layout()
plt.show()
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\4052684459.py:15: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
plt.figure(figsize=(20,30))
sns.pairplot(data1)
plt.show()
C:\Users\Topsheed\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
<Figure size 2000x3000 with 0 Axes>
fig, axarr = plt.subplots(5, 2, figsize=(12, 18))
data1.groupby('Payment_Mode')['Profit_Margin'].mean().sort_values(ascending=False).plot.bar(ax=axarr[0][0], fontsize=12)
axarr[0][0].set_title("Payment_Mode Vs Profit_Margin", fontsize=18)
data1.groupby('Origin_Country')['Profit_Margin'].mean().sort_values(ascending=False).plot.bar(ax=axarr[0][1], fontsize=12)
axarr[0][1].set_title("Origin_Country Vs Profit_Margin", fontsize=18)
data1.groupby('Destination_Country')['Profit_Margin'].mean().sort_values(ascending=False).plot.bar(ax=axarr[1][0], fontsize=12)
axarr[1][0].set_title("Destination_Country Vs Profit_Margin", fontsize=18)
data1.groupby('BookingYear')['Profit_Margin'].mean().sort_values(ascending=False).plot.bar(ax=axarr[1][1], fontsize=12)
axarr[1][1].set_title("BookingYear Vs Profit_Margin", fontsize=18)
data1.groupby('BookingMonth')['Profit_Margin'].mean().sort_values(ascending=False).head(10).plot.bar(ax=axarr[2][0], fontsize=12)
axarr[2][0].set_title("Booking month Vs Profit_Margin", fontsize=18)
data1.groupby('BookingDay')['Profit_Margin'].mean().sort_values(ascending=False).head(10).plot.bar(ax=axarr[2][1], fontsize=12)
axarr[2][1].set_title("Booking day Vs Profit_Margin", fontsize=18)
data1.groupby('Gender')['Profit_Margin'].mean().sort_values(ascending=False).plot.bar(ax=axarr[3][0], fontsize=12)
axarr[3][0].set_title("Gender Vs Profit_Margin", fontsize=18)
data1.groupby('Destination_City')['Profit_Margin'].mean().sort_values(ascending=False).head(10).plot.bar(ax=axarr[3][1], fontsize=12)
axarr[3][1].set_title("Destination city Vs Profit_Margin", fontsize=18)
data1.groupby('Time')['Profit_Margin'].mean().sort_values(ascending=False).head(10).plot.bar(ax=axarr[4][0], fontsize=12)
axarr[4][0].set_title("Time Vs Profit_Margin", fontsize=18)
data1.groupby('Hotel_Name')['Profit_Margin'].mean().sort_values(ascending=False).head(10).plot.bar(ax=axarr[4][1], fontsize=12)
axarr[4][1].set_title("Hotel_Name Vs Profit_Margin", fontsize=18)
plt.subplots_adjust(hspace=1.2)
plt.subplots_adjust(wspace=.5)
sns.despine()
* Working on Several Variable
plt.figure(figsize=(12, 7))
sns.heatmap(data1.drop(['Time', 'Gender', 'Origin_Country', 'Destination_Country',
'Destination_City', 'Hotel_Name', 'Payment_Mode', 'BookingYear',
'BookingMonth', 'BookingDay'],axis=1).corr(), annot = True, vmin = -1, vmax = 1)
plt.show()
data1['Origin_Country'].count()
66541